# Basic Packages
from __future__ import division
import os
from datetime import datetime
# Web & file access
import requests
import io
# Import display options for showing websites
from IPython.display import IFrame, HTML
#!pip install plotnine
# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%pylab --no-import-all
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
import plotly.express as px
import plotly.graph_objects as go
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
# Next line can import all of plotnine, but may overwrite things? Better import each function/object you need
#from plotnine import *
Using matplotlib backend: <object object at 0x7fb3aff876e0> %pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib
# !pip install geoplot
# Data
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
# GIS & maps
import geopandas as gpd
gp = gpd
# import georasters as gr
# import geoplot as gplt
# import geoplot.crs as gcrs
import mapclassify as mc
#import textwrap
!pip install mapclassify
Collecting mapclassify Using cached mapclassify-2.4.3-py3-none-any.whl (38 kB) Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.9/site-packages (from mapclassify) (1.0.2) Requirement already satisfied: numpy>=1.3 in /opt/anaconda3/lib/python3.9/site-packages (from mapclassify) (1.21.5) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.9/site-packages (from mapclassify) (2.7.1) Requirement already satisfied: scipy>=1.0 in /opt/anaconda3/lib/python3.9/site-packages (from mapclassify) (1.7.3) Requirement already satisfied: pandas>=1.0 in /opt/anaconda3/lib/python3.9/site-packages (from mapclassify) (1.4.2) Requirement already satisfied: python-dateutil>=2.8.1 in /opt/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->mapclassify) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.9/site-packages (from pandas>=1.0->mapclassify) (2021.3) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas>=1.0->mapclassify) (1.16.0) Requirement already satisfied: joblib>=0.11 in /opt/anaconda3/lib/python3.9/site-packages (from scikit-learn->mapclassify) (1.1.0) Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.9/site-packages (from scikit-learn->mapclassify) (2.2.0) Installing collected packages: mapclassify Successfully installed mapclassify-2.4.3
import mapclassify as mc
# !pip install stargazer
# Data Munging
from itertools import product, combinations
# import difflib
# import pycountry
# import geocoder
# from geonamescache.mappers import country
# mapper = country(from_key='name', to_key='iso3')
# mapper2 = country(from_key='iso3', to_key='iso')
# mapper3 = country(from_key='iso3', to_key='name')
# Regressions & Stats
from scipy.stats import norm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer, LineLocation
# Paths
pathout = './data/'
if not os.path.exists(pathout):
os.mkdir(pathout)
pathgraphs = './graphs/'
if not os.path.exists(pathgraphs):
os.mkdir(pathgraphs)
currentYear = datetime.now().year
year = min(2020, currentYear-2)
Search for the variable you are interested in
(e.g., GDP per capita, PPP (constant 2017 international $))
The link will become https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD
(if feasible it will display a figure)
url = 'https://data.worldbank.org/share/widget?indicators=NY.GDP.PCAP.PP.KD'
IFrame(url, width=500, height=300)
Suboptimal: Download from the website
(Not best approach, since you need to do it for every variable every time data is updated)
url = 'https://pandas-datareader.readthedocs.io/en/latest/remote_data.html#remote-data-wb'
IFrame(url, width=800, height=400)
from pandas_datareader import data, wb
wbcountries = wb.get_countries()
# If you want to keep aggregate data for regions or world comment out next line
wbcountries = wbcountries.loc[wbcountries.region.isin(['Aggregates'])==False].reset_index(drop=True)
wbcountries['name'] = wbcountries.name.str.strip()
wbcountries['incomeLevel'] = wbcountries['incomeLevel'].str.title()
wbcountries.loc[wbcountries.iso3c=='VEN', 'incomeLevel'] = 'Upper Middle Income'
wbcountries = wb.get_countries()
wbcountries = wbcountries.loc[wbcountries.region.isin(['Aggregates'])==False].reset_index(drop=True)
wbcountries['name'] = wbcountries.name.str.strip()
wbcountries['incomeLevel'] = wbcountries['incomeLevel'].str.title()
wbcountries.loc[wbcountries.iso3c=='VEN', 'incomeLevel'] = 'Upper Middle Income'
Few and varied indicators of interest:
Search for the variable on the WDI Indicator website (as explained above)
(e.g., https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.KD)
Add the indicator's name into a list of indicators in Python
(i.e., everything that comes after https://data.worldbank.org/indicator/. E.g., NY.GDP.PCAP.PP.KD)
wdi_indicators = ['NY.GDP.PCAP.PP.KD', 'NY.GDP.PCAP.KD', 'SL.GDP.PCAP.EM.KD', 'SP.POP.GROW', 'SP.POP.TOTL', 'SP.DYN.WFRT', 'SP.DYN.TFRT.IN']
wdi_indicators = ['NY.GDP.PCAP.PP.KD', 'NY.GDP.PCAP.KD', 'SL.GDP.PCAP.EM.KD', 'SP.POP.GROW', 'SP.POP.TOTL', 'SP.DYN.WFRT', 'SP.DYN.TFRT.IN']
Many related indicators or mass search
(e.g., search for all variables containing the word population)
popvars = wb.search(string='population')
This returns a dataframe, where the column id has the IDs for the indicators
popvars = wb.search(string='population')
popvars.head()
| id | name | unit | source | sourceNote | sourceOrganization | topics | |
|---|---|---|---|---|---|---|---|
| 24 | 1.1_ACCESS.ELECTRICITY.TOT | Access to electricity (% of total population) | Sustainable Energy for All | Access to electricity is the percentage of pop... | b'World Bank Global Electrification Database 2... | ||
| 39 | 1.2_ACCESS.ELECTRICITY.RURAL | Access to electricity (% of rural population) | Sustainable Energy for All | Access to electricity is the percentage of rur... | b'World Bank Global Electrification Database 2... | ||
| 40 | 1.3_ACCESS.ELECTRICITY.URBAN | Access to electricity (% of urban population) | Sustainable Energy for All | Access to electricity is the percentage of tot... | b'World Bank Global Electrification Database 2... | ||
| 164 | 2.1_ACCESS.CFT.TOT | Access to Clean Fuels and Technologies for coo... | Sustainable Energy for All | b'' | |||
| 195 | 3.11.01.01.popcen | Population census | Statistical Capacity Indicators | Population censuses collect data on the size, ... | b'World Bank Microdata library. Original sourc... |
wdi = wb.download(indicator=wdi_indicators,
country=list_of_countries_ISO_A2_codes,
start=start_year,
end=end_year)
wdi = wdi.reset_index()
wdi['year'] = wdi.year.astype(int)
wdi = wb.download(indicator=wdi_indicators, country=wbcountries.iso2c.values, start=1950, end=year)
wdi = wdi.reset_index()
wdi['year'] = wdi.year.astype(int)
wdi['gdp_pc'] = wdi['NY.GDP.PCAP.PP.KD']
wdi['ln_gdp_pc'] = wdi['NY.GDP.PCAP.PP.KD'].apply(np.log)
wdi['ln_pop'] = wdi['SP.POP.TOTL'].apply(np.log)
wdi.head()
/Users/ozak/anaconda3/envs/EconGrowthUG-Builds/lib/python3.9/site-packages/pandas_datareader/wb.py:592: UserWarning: Non-standard ISO country codes: JG, XK
| country | year | NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | gdp_pc | ln_gdp_pc | ln_pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aruba | 2020 | 29563.756955 | 23026.332866 | NaN | 0.428017 | 106766.0 | NaN | 1.901 | 29563.756955 | 10.294304 | 11.578395 |
| 1 | Aruba | 2019 | 38221.117314 | 29769.293907 | NaN | 0.437415 | 106310.0 | NaN | 1.901 | 38221.117314 | 10.551143 | 11.574115 |
| 2 | Aruba | 2018 | 39206.356147 | 30536.667193 | NaN | 0.459266 | 105846.0 | NaN | 1.896 | 39206.356147 | 10.576594 | 11.569740 |
| 3 | Aruba | 2017 | 38893.960556 | 30293.351539 | NaN | 0.471874 | 105361.0 | NaN | 1.886 | 38893.960556 | 10.568594 | 11.565148 |
| 4 | Aruba | 2016 | 37046.877414 | 28854.713299 | NaN | 0.502860 | 104865.0 | NaN | 1.872 | 37046.877414 | 10.519939 | 11.560429 |
wbcountries dataframewdi = wbcountries.merge(wdi, left_on='name', right_on='country')
wdi = wbcountries.merge(wdi, left_on='name', right_on='country')
wdi.head()
| iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | gdp_pc | ln_gdp_pc | ln_pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABW | AW | Aruba | Latin America & Caribbean | High Income | Not classified | Oranjestad | -70.0167 | 12.5167 | ... | 29563.756955 | 23026.332866 | NaN | 0.428017 | 106766.0 | NaN | 1.901 | 29563.756955 | 10.294304 | 11.578395 | |
| 1 | ABW | AW | Aruba | Latin America & Caribbean | High Income | Not classified | Oranjestad | -70.0167 | 12.5167 | ... | 38221.117314 | 29769.293907 | NaN | 0.437415 | 106310.0 | NaN | 1.901 | 38221.117314 | 10.551143 | 11.574115 | |
| 2 | ABW | AW | Aruba | Latin America & Caribbean | High Income | Not classified | Oranjestad | -70.0167 | 12.5167 | ... | 39206.356147 | 30536.667193 | NaN | 0.459266 | 105846.0 | NaN | 1.896 | 39206.356147 | 10.576594 | 11.569740 | |
| 3 | ABW | AW | Aruba | Latin America & Caribbean | High Income | Not classified | Oranjestad | -70.0167 | 12.5167 | ... | 38893.960556 | 30293.351539 | NaN | 0.471874 | 105361.0 | NaN | 1.886 | 38893.960556 | 10.568594 | 11.565148 | |
| 4 | ABW | AW | Aruba | Latin America & Caribbean | High Income | Not classified | Oranjestad | -70.0167 | 12.5167 | ... | 37046.877414 | 28854.713299 | NaN | 0.502860 | 104865.0 | NaN | 1.872 | 37046.877414 | 10.519939 | 11.560429 |
5 rows × 22 columns
url = 'https://www.statsmodels.org/stable/index.html'
IFrame(url, width=800, height=400)
It is very easy to run a regression in statsmodels.
We only need
Equations are strings of the form
'dependent_variable ~ indep_var_1 + function(indep_var2) + C(indep_var3)'
where:
dependent_variable is the outcome variable of interestindep_var_1 is the first independent variable function(indep_var2) is a function of another independent variable (if needed) C(indep_var3) defines fixed-effects/dummies based on categories given in indep_var3dffig = wdi.loc[wdi.year==year]\
.dropna(subset=['ln_gdp_pc', 'latitude', 'ln_pop'])\
.sort_values(by='region').reset_index()
mod = smf.ols(formula='ln_gdp_pc ~ latitude', data=dffig, missing='drop').fit()
mod.summary2()
| Model: | OLS | Adj. R-squared: | 0.230 |
| Dependent Variable: | ln_gdp_pc | AIC: | 544.5117 |
| Date: | 2022-10-24 10:29 | BIC: | 551.0057 |
| No. Observations: | 190 | Log-Likelihood: | -270.26 |
| Df Model: | 1 | F-statistic: | 57.42 |
| Df Residuals: | 188 | Prob (F-statistic): | 1.55e-12 |
| R-squared: | 0.234 | Scale: | 1.0177 |
| Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 8.9496 | 0.0919 | 97.3483 | 0.0000 | 8.7682 | 9.1309 |
| latitude | 0.0230 | 0.0030 | 7.5777 | 0.0000 | 0.0170 | 0.0290 |
| Omnibus: | 0.797 | Durbin-Watson: | 1.698 |
| Prob(Omnibus): | 0.671 | Jarque-Bera (JB): | 0.853 |
| Skew: | 0.002 | Prob(JB): | 0.653 |
| Kurtosis: | 2.672 | Condition No.: | 38 |
pred_ols = mod.get_prediction()
iv_l = pred_ols.summary_frame()["mean_ci_lower"]
iv_u = pred_ols.summary_frame()["mean_ci_upper"]
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dffig.latitude, dffig.ln_gdp_pc, "o", label="data")
ax.plot(dffig.latitude, mod.fittedvalues, "r--.", label="OLS")
ax.plot(dffig.latitude, iv_u, "r--")
ax.plot(dffig.latitude, iv_l, "r--")
ax.legend(loc="best")
<matplotlib.legend.Legend at 0x193c14940>
fig
mod2 = smf.ols(formula='ln_gdp_pc ~ latitude + C(region)', data=dffig, missing='drop').fit()
mod2.summary2()
| Model: | OLS | Adj. R-squared: | 0.495 |
| Dependent Variable: | ln_gdp_pc | AIC: | 470.1465 |
| Date: | 2022-10-24 10:29 | BIC: | 496.1227 |
| No. Observations: | 190 | Log-Likelihood: | -227.07 |
| Df Model: | 7 | F-statistic: | 27.47 |
| Df Residuals: | 182 | Prob (F-statistic): | 1.54e-25 |
| R-squared: | 0.514 | Scale: | 0.66723 |
| Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 9.3596 | 0.1482 | 63.1635 | 0.0000 | 9.0673 | 9.6520 |
| C(region)[T.Europe & Central Asia] | 0.8276 | 0.2643 | 3.1314 | 0.0020 | 0.3061 | 1.3490 |
| C(region)[T.Latin America & Caribbean ] | 0.2065 | 0.2016 | 1.0243 | 0.3071 | -0.1913 | 0.6042 |
| C(region)[T.Middle East & North Africa] | 0.5115 | 0.2654 | 1.9273 | 0.0555 | -0.0122 | 1.0352 |
| C(region)[T.North America] | 1.5940 | 0.5155 | 3.0919 | 0.0023 | 0.5768 | 2.6112 |
| C(region)[T.South Asia] | -0.6446 | 0.3334 | -1.9337 | 0.0547 | -1.3024 | 0.0131 |
| C(region)[T.Sub-Saharan Africa ] | -1.2574 | 0.1917 | -6.5578 | 0.0000 | -1.6357 | -0.8791 |
| latitude | 0.0010 | 0.0043 | 0.2268 | 0.8208 | -0.0076 | 0.0095 |
| Omnibus: | 1.017 | Durbin-Watson: | 2.215 |
| Prob(Omnibus): | 0.601 | Jarque-Bera (JB): | 1.096 |
| Skew: | 0.168 | Prob(JB): | 0.578 |
| Kurtosis: | 2.842 | Condition No.: | 286 |
mod3 = smf.ols(formula='ln_gdp_pc ~ np.abs(latitude) + C(region)', data=dffig, missing='drop').fit()
mod3.summary2()
| Model: | OLS | Adj. R-squared: | 0.520 |
| Dependent Variable: | ln_gdp_pc | AIC: | 460.5396 |
| Date: | 2022-10-24 10:29 | BIC: | 486.5158 |
| No. Observations: | 190 | Log-Likelihood: | -222.27 |
| Df Model: | 7 | F-statistic: | 30.25 |
| Df Residuals: | 182 | Prob (F-statistic): | 1.73e-27 |
| R-squared: | 0.538 | Scale: | 0.63434 |
| Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 9.0090 | 0.1838 | 49.0269 | 0.0000 | 8.6464 | 9.3715 |
| C(region)[T.Europe & Central Asia] | 0.2314 | 0.2763 | 0.8376 | 0.4034 | -0.3137 | 0.7766 |
| C(region)[T.Latin America & Caribbean ] | 0.2287 | 0.1965 | 1.1635 | 0.2462 | -0.1591 | 0.6165 |
| C(region)[T.Middle East & North Africa] | 0.2693 | 0.2514 | 1.0712 | 0.2855 | -0.2267 | 0.7654 |
| C(region)[T.North America] | 1.1738 | 0.5036 | 2.3308 | 0.0209 | 0.1801 | 2.1674 |
| C(region)[T.South Asia] | -0.7494 | 0.3183 | -2.3541 | 0.0196 | -1.3775 | -0.1213 |
| C(region)[T.Sub-Saharan Africa ] | -1.1397 | 0.1894 | -6.0178 | 0.0000 | -1.5134 | -0.7660 |
| np.abs(latitude) | 0.0208 | 0.0068 | 3.0811 | 0.0024 | 0.0075 | 0.0341 |
| Omnibus: | 3.219 | Durbin-Watson: | 2.238 |
| Prob(Omnibus): | 0.200 | Jarque-Bera (JB): | 2.974 |
| Skew: | 0.305 | Prob(JB): | 0.226 |
| Kurtosis: | 3.064 | Condition No.: | 283 |
mod4 = smf.ols(formula='ln_gdp_pc ~ np.log(np.abs(latitude)) + C(region)', data=dffig, missing='drop').fit()
mod4.summary2()
| Model: | OLS | Adj. R-squared: | 0.497 |
| Dependent Variable: | ln_gdp_pc | AIC: | 469.4181 |
| Date: | 2022-10-24 10:29 | BIC: | 495.3943 |
| No. Observations: | 190 | Log-Likelihood: | -226.71 |
| Df Model: | 7 | F-statistic: | 27.68 |
| Df Residuals: | 182 | Prob (F-statistic): | 1.09e-25 |
| R-squared: | 0.516 | Scale: | 0.66468 |
| Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 9.2090 | 0.2315 | 39.7749 | 0.0000 | 8.7522 | 9.6658 |
| C(region)[T.Europe & Central Asia] | 0.7798 | 0.2140 | 3.6434 | 0.0004 | 0.3575 | 1.2020 |
| C(region)[T.Latin America & Caribbean ] | 0.1984 | 0.2014 | 0.9851 | 0.3259 | -0.1990 | 0.5957 |
| C(region)[T.Middle East & North Africa] | 0.4772 | 0.2510 | 1.9012 | 0.0589 | -0.0180 | 0.9725 |
| C(region)[T.North America] | 1.5506 | 0.5009 | 3.0956 | 0.0023 | 0.5622 | 2.5389 |
| C(region)[T.South Asia] | -0.6582 | 0.3253 | -2.0231 | 0.0445 | -1.3001 | -0.0163 |
| C(region)[T.Sub-Saharan Africa ] | -1.2359 | 0.1921 | -6.4327 | 0.0000 | -1.6150 | -0.8568 |
| np.log(np.abs(latitude)) | 0.0636 | 0.0735 | 0.8664 | 0.3874 | -0.0813 | 0.2086 |
| Omnibus: | 1.172 | Durbin-Watson: | 2.224 |
| Prob(Omnibus): | 0.557 | Jarque-Bera (JB): | 1.216 |
| Skew: | 0.185 | Prob(JB): | 0.544 |
| Kurtosis: | 2.871 | Condition No.: | 29 |
url = 'https://nbviewer.org/github/mwburke/stargazer/blob/master/examples.ipynb'
IFrame(url, width=800, height=400)
stargazer = Stargazer([mod, mod2, mod3, mod4])
stargazer.significant_digits(2)
stargazer.show_degrees_of_freedom(False)
#stargazer.dep_var_name = ''
stargazer.dependent_variable = ' Log[GDP per capita (' + str(year) + ')]'
stargazer.custom_columns(['Latitude', 'Abs(Latitude)', 'Log[Abs(Latitude)]'], [2, 1, 1])
#stargazer.show_model_numbers(False)
stargazer.rename_covariates({'latitude':'Latitude',
'np.abs(latitude)':'Absolute Latitude',
'np.log(np.abs(latitude))':'Log[Absolute Latitude]',})
stargazer.add_line('WB Region FE', ['No', 'Yes', 'Yes', 'Yes'], LineLocation.FOOTER_TOP)
stargazer.covariate_order(['latitude', 'np.abs(latitude)', 'np.log(np.abs(latitude))'])
stargazer.cov_spacing = 2
stargazer
| Dependent variable: Log[GDP per capita (2020)] | ||||
| Latitude | Abs(Latitude) | Log[Abs(Latitude)] | ||
| (1) | (2) | (3) | (4) | |
| Latitude | 0.02*** | 0.00 | ||
| (0.00) | (0.00) | |||
| Absolute Latitude | 0.02*** | |||
| (0.01) | ||||
| Log[Absolute Latitude] | 0.06 | |||
| (0.07) | ||||
| WB Region FE | No | Yes | Yes | Yes |
| Observations | 190 | 190 | 190 | 190 |
| R2 | 0.23 | 0.51 | 0.54 | 0.52 |
| Adjusted R2 | 0.23 | 0.50 | 0.52 | 0.50 |
| Residual Std. Error | 1.01 | 0.82 | 0.80 | 0.82 |
| F Statistic | 57.42*** | 27.47*** | 30.25*** | 27.68*** |
| Note: | *p<0.1; **p<0.05; ***p<0.01 | |||
HTML(stargazer.render_html())
HTML(stargazer.render_html())
| Dependent variable: Log[GDP per capita (2020)] | ||||
| Latitude | Abs(Latitude) | Log[Abs(Latitude)] | ||
| (1) | (2) | (3) | (4) | |
| Latitude | 0.02*** | 0.00 | ||
| (0.00) | (0.00) | |||
| Absolute Latitude | 0.02*** | |||
| (0.01) | ||||
| Log[Absolute Latitude] | 0.06 | |||
| (0.07) | ||||
| WB Region FE | No | Yes | Yes | Yes |
| Observations | 190 | 190 | 190 | 190 |
| R2 | 0.23 | 0.51 | 0.54 | 0.52 |
| Adjusted R2 | 0.23 | 0.50 | 0.52 | 0.50 |
| Residual Std. Error | 1.01 | 0.82 | 0.80 | 0.82 |
| F Statistic | 57.42*** | 27.47*** | 30.25*** | 27.68*** |
| Note: | *p<0.1; **p<0.05; ***p<0.01 | |||
file_name = "table.html" #Include directory path if needed
html_file = open(pathgraphs + file_name, "w" ) #This will overwrite an existing file
html_file.write( stargazer.render_html() )
html_file.close()
url = pathgraphs + 'table.html'
url = 'https://smu-econ-growth.github.io/EconGrowthUG-Slides-Working-with-WDI/table.html'
IFrame(url, width=500, height=300)
url = 'https://seaborn.pydata.org/examples/index.html'
IFrame(url, width=800, height=400)
Let's create a Scatterplot with varying point sizes and hues that plots the latitude and Log[GDP per capita] of each country and uses its log-population and the WB region in the last available year as the size and hue.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
g = sns.relplot(x="latitude",
y="ln_gdp_pc",
data=dffig,
hue="region",
hue_order = dffig.region.drop_duplicates().sort_values(),
style="region",
style_order = dffig.region.drop_duplicates().sort_values(),
size="ln_pop",
sizes=(10, 400),
alpha=.5,
height=6,
aspect=2,
palette="muted",
)
g.set_axis_labels('Latitude', 'Log[GDP per capita (' + str(year) + ')]')
<seaborn.axisgrid.FacetGrid at 0x1973aa880>
g.fig
scatterplot¶sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
fig, ax = plt.subplots()
sns.scatterplot(x="latitude",
y="ln_gdp_pc",
data=dffig,
hue="region",
hue_order = dffig.region.drop_duplicates().sort_values(),
style="region",
style_order = dffig.region.drop_duplicates().sort_values(),
size="ln_pop",
sizes=(10, 400),
alpha=.5,
palette="muted",
ax=ax
)
ax.set_xlabel('Latitude')
ax.set_ylabel('Log[GDP per capita (' + str(year) + ')]')
ax.legend(fontsize=10)
<matplotlib.legend.Legend at 0x1943c5ca0>
fig
def my_xy_plot(dfin,
x='SP.POP.GROW',
y='ln_gdp_pc',
labelvar='iso3c',
dx=0.006125,
dy=0.006125,
xlogscale=False,
ylogscale=False,
xlabel='Growth Rate of Population',
ylabel='Log[Income per capita in ' + str(year) + ']',
labels=False,
xpct = False,
ypct = False,
OLS=False,
OLSlinelabel='OLS',
ssline=False,
sslinelabel='45 Degree Line',
filename='income-pop-growth.pdf',
hue='region',
hue_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
style='incomeLevel',
style_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=None,
size=None,
sizes=None,
legend_fontsize=10,
label_font_size=12,
save=True):
'''
Plot the association between x and var in dataframe using labelvar for labels.
'''
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
df = dfin.copy()
df = df.dropna(subset=[x, y]).reset_index(drop=True)
# Plot
k = 0
fig, ax = plt.subplots()
sns.scatterplot(x=x, y=y, data=df, ax=ax,
hue=hue,
hue_order=hue_order,
#hue='incomeLevel',
#hue_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
#hue_order=['East Asia & Pacific', 'Europe & Central Asia',
# 'Latin America & Caribbean ', 'Middle East & North Africa',
# 'North America', 'South Asia', 'Sub-Saharan Africa '],
alpha=1,
style=style,
style_order=style_order,
palette=palette,
size=size,
sizes=sizes,
#palette=sns.color_palette("Blues_r", df[hue].unique().shape[0]+6)[:df[hue].unique().shape[0]*2:2],
)
if OLS:
sns.regplot(x=x, y=y, data=df, ax=ax, label=OLSlinelabel, scatter=False)
if ssline:
ax.plot([df[x].min()*.99, df[x].max()*1.01], [df[x].min()*.99, df[x].max()*1.01], c='r', label=sslinelabel)
if labels:
movex = df[x].mean() * dx
movey = df[y].mean() * dy
for line in range(0,df.shape[0]):
ax.text(df[x][line]+movex, df[y][line]+movey, df[labelvar][line], horizontalalignment='left', fontsize=label_font_size, color='black')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
if xpct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
xticks = mtick.FormatStrFormatter(fmt)
ax.xaxis.set_major_formatter(xticks)
if ypct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
ax.yaxis.set_major_formatter(yticks)
if ylogscale:
ax.set(yscale="log")
if xlogscale:
ax.set(xscale="log")
handles, labels = ax.get_legend_handles_labels()
handles = np.array(handles)
labels = np.array(labels)
handles = list(handles[(labels!=hue) & (labels!=style) & (labels!=size)])
labels = list(labels[(labels!=hue) & (labels!=style) & (labels!=size)])
ax.legend(handles=handles, labels=labels, fontsize=legend_fontsize)
if save:
plt.savefig(pathgraphs + filename, dpi=300, bbox_inches='tight')
return fig
g = my_xy_plot(dffig,
x='latitude',
y='ln_gdp_pc',
xlabel='Latitude',
ylabel='Log[GDP per capita (' + str(year) +')]',
OLS=True,
labels=True,
#size="ln_pop",
#sizes=(10, 400),
filename='ln-gdp-pc-latitude.pdf')
g
def my_xy_line_plot(dfin,
x='year',
y='ln_gdp_pc',
labelvar='iso3c',
dx=0.006125,
dy=0.006125,
xlogscale=False,
ylogscale=False,
xlabel='Growth Rate of Population',
ylabel='Log[Income per capita in ' + str(year) + ']',
labels=False,
xpct = False,
ypct = False,
OLS=False,
OLSlinelabel='OLS',
ssline=False,
sslinelabel='45 Degree Line',
filename='income-pop-growth.pdf',
hue='region',
hue_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
style='incomeLevel',
style_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=None,
legend_fontsize=10,
label_fontsize=12,
loc=None,
save=True):
'''
Plot the association between x and var in dataframe using labelvar for labels.
'''
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
df = dfin.copy()
df = df.dropna(subset=[x, y]).reset_index(drop=True)
# Plot
k = 0
fig, ax = plt.subplots()
sns.lineplot(x=x, y=y, data=df, ax=ax,
hue=hue,
hue_order=hue_order,
alpha=1,
style=style,
style_order=style_order,
palette=palette,
)
if OLS:
sns.regplot(x=x, y=y, data=df, ax=ax, label=OLSlinelabel, scatter=False)
if ssline:
ax.plot([df[x].min()*.99, df[x].max()*1.01], [df[x].min()*.99, df[x].max()*1.01], c='r', label=sslinelabel)
if labels:
movex = df[x].mean() * dx
movey = df[y].mean() * dy
for line in range(0,df.shape[0]):
ax.text(df[x][line]+movex, df[y][line]+movey, df[labelvar][line], horizontalalignment='left', fontsize=label_fontsize, color='black')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
if xpct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
xticks = mtick.FormatStrFormatter(fmt)
ax.xaxis.set_major_formatter(xticks)
if ypct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
ax.yaxis.set_major_formatter(yticks)
if ylogscale:
ax.set(yscale="log")
if xlogscale:
ax.set(xscale="log")
handles, labels = ax.get_legend_handles_labels()
handles = np.array(handles)
labels = np.array(labels)
handles = list(handles[(labels!='region') & (labels!='incomeLevel')])
labels = list(labels[(labels!='region') & (labels!='incomeLevel')])
ax.legend(handles=handles, labels=labels, fontsize=legend_fontsize, loc=loc)
if save:
plt.savefig(pathgraphs + filename, dpi=300, bbox_inches='tight')
return fig
palette=sns.color_palette("Blues_r", wdi['incomeLevel'].unique().shape[0]+6)[:wdi['incomeLevel'].unique().shape[0]*2:2]
fig = my_xy_line_plot(wdi,
x='year',
y='ln_gdp_pc',
xlabel='Year',
ylabel='Log[GDP per capita]',
filename='ln-gdp-pc-income-groups-TS.pdf',
hue='incomeLevel',
hue_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=palette,
OLS=False,
labels=False,
legend_fontsize=16,
loc='lower right',
save=True)
fig
#palette=sns.color_palette("Blues_r", wdi['region'].unique().shape[0]+6)[:wdi['region'].unique().shape[0]*2:2]
fig = my_xy_line_plot(wdi,
x='year',
y='gdp_pc',
xlabel='Year',
ylabel='GDP per capita',
ylogscale=True,
filename='ln-gdp-pc-regions-TS.pdf',
style='region',
style_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
#palette=palette,
OLS=False,
labels=False,
legend_fontsize=12,
loc='lower right',
save=True)
fig
url = 'https://plotly.com/python/'
IFrame(url, width=800, height=400)
symbols = ['circle', 'x', 'square', 'cross', 'diamond', 'star-diamond', 'triangle-up']
fig = px.scatter(dffig,
x="latitude",
y="ln_gdp_pc",
color='region',
symbol='region',
symbol_sequence=symbols,
hover_name='name',
hover_data=['iso3c', 'ln_pop', 'gdp_pc'],
size='ln_pop',
size_max=15,
trendline="ols",
trendline_scope="overall",
trendline_color_override="black",
labels={
"latitude": "Latitude",
"ln_gdp_pc": "Log[GDP per capita (" + str(year) + ")]",
"gdp_pc": "GDP per capita (" + str(year) + ")",
"region": "WB Region"
},
opacity=0.75,
height=800,
)
fig.show()
fig.update_traces(marker=dict(#size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
tr_line=[]
for k, trace in enumerate(fig.data):
if trace.mode is not None and trace.mode == 'lines':
tr_line.append(k)
print(tr_line)
for id in tr_line:
fig.data[id].update(line_width=3)
[7]
fig.show()
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
))
fig.show()
fig.write_image(pathgraphs + "fig1.pdf")
fig.write_image(pathgraphs + "fig1.png")
fig.write_image(pathgraphs + "fig1.jpg")
fig.write_image(pathgraphs + "ln-gdp-pc-latitude-plotly.pdf", height=1000, width=1500, scale=4)
results = px.get_trendline_results(fig)
results.px_fit_results.iloc[0].summary()
| Dep. Variable: | y | R-squared: | 0.234 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.230 |
| Method: | Least Squares | F-statistic: | 57.42 |
| Date: | Mon, 24 Oct 2022 | Prob (F-statistic): | 1.55e-12 |
| Time: | 10:29:45 | Log-Likelihood: | -270.26 |
| No. Observations: | 190 | AIC: | 544.5 |
| Df Residuals: | 188 | BIC: | 551.0 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 8.9496 | 0.092 | 97.348 | 0.000 | 8.768 | 9.131 |
| x1 | 0.0230 | 0.003 | 7.578 | 0.000 | 0.017 | 0.029 |
| Omnibus: | 0.797 | Durbin-Watson: | 1.678 |
|---|---|---|---|
| Prob(Omnibus): | 0.671 | Jarque-Bera (JB): | 0.853 |
| Skew: | 0.002 | Prob(JB): | 0.653 |
| Kurtosis: | 2.672 | Cond. No. | 38.1 |
There are various types of data in Geographic Information Systems (GIS)
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
countries = gp.read_file(io.BytesIO(r.content))
fig, ax = plt.subplots(figsize=(15,10))
countries.plot(ax=ax)
ax.set_title("WGS84 (lat/lon)", fontdict={'fontsize':34})
Text(0.5, 1.0, 'WGS84 (lat/lon)')
dffig2 = countries.merge(dffig, left_on='ADM0_A3', right_on='iso3c')
fig, ax = plt.subplots(figsize=(15,10))
dffig2.plot(column='gdp_pc', ax=ax, cmap='Reds')
ax.set_title("WGS84 (lat/lon)", fontdict={'fontsize':34})
Text(0.5, 1.0, 'WGS84 (lat/lon)')
url = 'https://residentmario.github.io/geoplot/'
IFrame(url, width=800, height=400)
gplt.polyplot(
countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', facecolor='lightgray',
rasterized=True,
extent=[-180, -90, 180, 90],
)
<GeoAxesSubplot:>
gplt.choropleth(dffig2, hue='gdp_pc',
projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white',
linewidth=1,
cmap='Reds', legend=True,
scheme='FisherJenks',
legend_kwargs={'bbox_to_anchor':(0.3, 0.5),
'frameon': True,
'title':'GDP per capita',
},
figsize=(12,8),
rasterized=True,
)
<GeoAxesSubplot:>
ax = gplt.choropleth(dffig2, hue='gdp_pc', projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', linewidth=1,
cmap='Reds', legend=True,
scheme='FisherJenks',
legend_kwargs={'bbox_to_anchor':(0.3, 0.5),
'frameon': True,
'title':'GDP per capita',
},
figsize=(15,10),
rasterized=True,
)
gplt.polyplot(countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', facecolor='lightgray',
ax=ax,
rasterized=True,
extent=[-180, -90, 180, 90],
)
<GeoAxesSubplot:>
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
'''Center Text (to be used in legend)'''
lines = text
#lines = textwrap.wrap(text, **kw)
return "\n".join(line.center(cwidth) for line in lines)
def MyChloropleth(mydf, myfile='fig', myvar='gdp_pc',
mylegend='GDP per capita',
k=5,
extent=[-180, -90, 180, 90],
bbox_to_anchor=(0.2, 0.5),
edgecolor='white', facecolor='lightgray',
scheme='FisherJenks',
save=True,
percent=False,
**kwargs):
# Chloropleth
# Color scheme
if scheme=='EqualInterval':
scheme = mc.EqualInterval(mydf[myvar], k=k)
elif scheme=='Quantiles':
scheme = mc.Quantiles(mydf[myvar], k=k)
elif scheme=='BoxPlot':
scheme = mc.BoxPlot(mydf[myvar], k=k)
elif scheme=='FisherJenks':
scheme = mc.FisherJenks(mydf[myvar], k=k)
elif scheme=='FisherJenksSampled':
scheme = mc.FisherJenksSampled(mydf[myvar], k=k)
elif scheme=='HeadTailBreaks':
scheme = mc.HeadTailBreaks(mydf[myvar], k=k)
elif scheme=='JenksCaspall':
scheme = mc.JenksCaspall(mydf[myvar], k=k)
elif scheme=='JenksCaspallForced':
scheme = mc.JenksCaspallForced(mydf[myvar], k=k)
elif scheme=='JenksCaspallSampled':
scheme = mc.JenksCaspallSampled(mydf[myvar], k=k)
elif scheme=='KClassifiers':
scheme = mc.KClassifiers(mydf[myvar], k=k)
# Format legend
upper_bounds = scheme.bins
# get and format all bounds
bounds = []
for index, upper_bound in enumerate(upper_bounds):
if index == 0:
lower_bound = mydf[myvar].min()
else:
lower_bound = upper_bounds[index-1]
# format the numerical legend here
if percent:
bound = f'{lower_bound:.0%} - {upper_bound:.0%}'
else:
bound = f'{float(lower_bound):,.0f} - {float(upper_bound):,.0f}'
bounds.append(bound)
legend_labels = bounds
#Plot
ax = gplt.choropleth(
mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', linewidth=1,
cmap='Reds', legend=True,
scheme=scheme,
legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
'frameon': True,
'title':mylegend,
},
legend_labels = legend_labels,
figsize=(24, 16),
rasterized=True,
)
gplt.polyplot(
countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor=edgecolor, facecolor=facecolor,
ax=ax,
rasterized=True,
extent=extent,
)
if save:
plt.savefig(pathgraphs + myfile + '.pdf', dpi=300, bbox_inches='tight')
plt.savefig(pathgraphs + myfile + '.png', dpi=300, bbox_inches='tight')
pass
mylegend = center_wrap(["GDP per capita in " + str(year)], cwidth=32, width=32)
MyChloropleth(dffig2, myfile='fig-gdp-pc-' + str(year), myvar='gdp_pc', mylegend=mylegend, k=10, scheme='Quantiles', save=True)
url = 'https://plotly.com/python/maps/'
IFrame(url, width=800, height=400)
scheme = mc.Quantiles(dffig2['gdp_pc'], k=5)
classifier = mc.Quantiles.make(k=5, rolling=True)
dffig2['gdp_pc_q'] = classifier(dffig2['gdp_pc'])
dffig2['gdp_pc_qc'] = dffig2['gdp_pc_q'].apply(lambda x: scheme.get_legend_classes()[x].replace('[ ', '[').replace('( ', '('))
fig = px.choropleth(dffig2.sort_values('gdp_pc_q', ascending=True),
locations="iso3c",
color="gdp_pc_qc",
hover_name='name',
hover_data=['iso3c', 'ln_pop'],
labels={
"gdp_pc_qc": "GDP per capita (" + str(year) + ")",
},
color_discrete_sequence=px.colors.sequential.Reds,
height=600,
width=1000,
)
# Change legend position
fig.update_layout(legend=dict(
yanchor="bottom",
y=0.15,
xanchor="left",
x=0.05
))
fig.show()
fig = px.choropleth(dffig2.sort_values('gdp_pc_q', ascending=True),
locations="iso3c",
color="gdp_pc_qc",
hover_name='name',
hover_data=['iso3c', 'gdp_pc' ,'ln_pop'],
labels={
"gdp_pc_qc": "GDP per capita (" + str(year) + ")",
"gdp_pc": "GDP per capita (" + str(year) + ")",
'iso3c':'ISO code',
"ln_pop": "Log[Population (" + str(year) + ")]",
},
color_discrete_sequence=px.colors.sequential.Blues,
height=600,
width=1000,
)
# Change legend position
fig.update_layout(legend=dict(
yanchor="bottom",
y=0.15,
xanchor="left",
x=0.05
))
fig.show()
fig = px.choropleth(dffig,
locations="iso3c",
color="ln_gdp_pc",
hover_name='name',
hover_data=['iso3c', 'ln_pop'],
labels={
"ln_gdp_pc": "Log[GDP per capita (" + str(year) + ")]",
},
#color_continuous_scale=px.colors.sequential.Plasma,
color_continuous_scale="Reds",
height=600,
width=1100,
)
fig.show()
fig.update_layout(coloraxis_colorbar=dict(
orientation = 'h',
yanchor="bottom",
xanchor="left",
y=-.2,
x=0,
))
fig.update_coloraxes(colorbar_title_side='top')
fig.show()
# Change legend position
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="center",
x=0.01,
orientation='h',
))
fig.show()
fig = go.Figure(data=go.Choropleth(
locations = dffig['iso3c'],
z = dffig['gdp_pc'],
text = dffig['name'],
colorscale = 'Blues',
autocolorscale=False,
reversescale=True,
marker_line_color='darkgray',
marker_line_width=0.5,
colorbar_tickprefix = '$',
colorbar_title = 'GDP pc',
)
)
fig.update_layout(
autosize=False,
width=800,
height=400,
margin=dict(
l=5,
r=5,
b=10,
t=10,
pad=1
),
paper_bgcolor="LightSteelBlue",
)
fig.show()
fig = go.Figure(data=go.Choropleth(
locations = dffig['iso3c'],
z = dffig['gdp_pc'],
text = dffig['name'],
colorscale = 'Blues',
autocolorscale=False,
reversescale=True,
marker_line_color='darkgray',
marker_line_width=0.5,
colorbar_tickprefix = '$',
colorbar_title = 'GDP per capita',
)
)
fig.update_layout(
autosize=False,
width=1000,
height=600,
margin=dict(
l=1,
r=1,
b=1,
t=1,
pad=.1
),
paper_bgcolor="LightSteelBlue",
)
# Change legend position
cb = fig.data[0].colorbar
cb.orientation = 'h'
cb.yanchor = 'bottom'
cb.xanchor = 'center'
cb.y = .1
cb.title.side = 'top'
fig.show()
from pandas_datareader import wb, data
import pandas as pd
wbcountries = wb.get_countries()
wbcountries = wbcountries[wbcountries['region'] != "Aggregates"]
wbcountries = wbcountries[wbcountries['iso3c'].notnull()]
wbcountries
| iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABW | AW | Aruba | Latin America & Caribbean | High income | Not classified | Oranjestad | -70.0167 | 12.51670 | |
| 2 | AFG | AF | Afghanistan | South Asia | South Asia | Low income | IDA | Kabul | 69.1761 | 34.52280 |
| 5 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 |
| 6 | ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IBRD | Tirane | 19.8172 | 41.33170 |
| 7 | AND | AD | Andorra | Europe & Central Asia | High income | Not classified | Andorra la Vella | 1.5218 | 42.50750 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 293 | XKX | XK | Kosovo | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IDA | Pristina | 20.9260 | 42.56500 |
| 295 | YEM | YE | Yemen, Rep. | Middle East & North Africa | Middle East & North Africa (excluding high inc... | Low income | IDA | Sana'a | 44.2075 | 15.35200 |
| 296 | ZAF | ZA | South Africa | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Upper middle income | IBRD | Pretoria | 28.1871 | -25.74600 |
| 297 | ZMB | ZM | Zambia | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low income | IDA | Lusaka | 28.2937 | -15.39820 |
| 298 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 |
218 rows × 10 columns
# IP.PAT.NRES
non_residents = wb.download(indicator="IP.PAT.NRES", country=wbcountries.iso2c.values, start=1950, end=year)
non_residents = non_residents[non_residents['IP.PAT.NRES'].notnull()]
non_residents = non_residents.reset_index()
residents = wb.download(indicator="IP.PAT.RESD", country=wbcountries.iso2c.values, start=1950, end=year)
residents = residents[residents['IP.PAT.RESD'].notnull()]
residents = residents.reset_index()
residents
/opt/anaconda3/lib/python3.9/site-packages/pandas_datareader/wb.py:592: UserWarning: Non-standard ISO country codes: JG, XK /opt/anaconda3/lib/python3.9/site-packages/pandas_datareader/wb.py:592: UserWarning: Non-standard ISO country codes: JG, XK
| country | year | IP.PAT.RESD | |
|---|---|---|---|
| 0 | Aruba | 2002 | 1.0 |
| 1 | Angola | 2020 | 85.0 |
| 2 | Angola | 2019 | 2.0 |
| 3 | Angola | 2018 | 6.0 |
| 4 | Angola | 1992 | 4.0 |
| ... | ... | ... | ... |
| 3727 | Zimbabwe | 1984 | 34.0 |
| 3728 | Zimbabwe | 1983 | 40.0 |
| 3729 | Zimbabwe | 1982 | 41.0 |
| 3730 | Zimbabwe | 1981 | 35.0 |
| 3731 | Zimbabwe | 1980 | 39.0 |
3732 rows × 3 columns
# IP.PAT.RESD
patents = residents.merge(non_residents, on=['country', 'year'])
patents = patents.reset_index()
patents['total_patents'] = patents['IP.PAT.RESD']+ patents['IP.PAT.NRES']
patents
| index | country | year | IP.PAT.RESD | IP.PAT.NRES | total_patents | |
|---|---|---|---|---|---|---|
| 0 | 0 | Angola | 2019 | 2.0 | 108.0 | 110.0 |
| 1 | 1 | Angola | 2018 | 6.0 | 114.0 | 120.0 |
| 2 | 2 | Angola | 1992 | 4.0 | 2.0 | 6.0 |
| 3 | 3 | Albania | 2019 | 4.0 | 1.0 | 5.0 |
| 4 | 4 | Albania | 2018 | 15.0 | 3.0 | 18.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 3668 | 3668 | Zimbabwe | 1984 | 34.0 | 191.0 | 225.0 |
| 3669 | 3669 | Zimbabwe | 1983 | 40.0 | 237.0 | 277.0 |
| 3670 | 3670 | Zimbabwe | 1982 | 41.0 | 230.0 | 271.0 |
| 3671 | 3671 | Zimbabwe | 1981 | 35.0 | 274.0 | 309.0 |
| 3672 | 3672 | Zimbabwe | 1980 | 39.0 | 281.0 | 320.0 |
3673 rows × 6 columns
df = wbcountries.merge(patents, left_on='name', right_on='country')
df
| iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | index | country | year | IP.PAT.RESD | IP.PAT.NRES | total_patents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 | 0 | Angola | 2019 | 2.0 | 108.0 | 110.0 |
| 1 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 | 1 | Angola | 2018 | 6.0 | 114.0 | 120.0 |
| 2 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 | 2 | Angola | 1992 | 4.0 | 2.0 | 6.0 |
| 3 | ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IBRD | Tirane | 19.8172 | 41.33170 | 3 | Albania | 2019 | 4.0 | 1.0 | 5.0 |
| 4 | ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IBRD | Tirane | 19.8172 | 41.33170 | 4 | Albania | 2018 | 15.0 | 3.0 | 18.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3668 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | 3668 | Zimbabwe | 1984 | 34.0 | 191.0 | 225.0 |
| 3669 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | 3669 | Zimbabwe | 1983 | 40.0 | 237.0 | 277.0 |
| 3670 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | 3670 | Zimbabwe | 1982 | 41.0 | 230.0 | 271.0 |
| 3671 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | 3671 | Zimbabwe | 1981 | 35.0 | 274.0 | 309.0 |
| 3672 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | 3672 | Zimbabwe | 1980 | 39.0 | 281.0 | 320.0 |
3673 rows × 16 columns
wdi_indicators = ['NY.GDP.PCAP.PP.KD', 'NY.GDP.PCAP.KD', 'SL.GDP.PCAP.EM.KD', 'SP.POP.GROW', 'SP.POP.TOTL', 'SP.DYN.WFRT', 'SP.DYN.TFRT.IN']
wdi = wb.download(indicator = wdi_indicators,
country=wbcountries.iso2c.values, start=1950, end=year )
wdi
/opt/anaconda3/lib/python3.9/site-packages/pandas_datareader/wb.py:592: UserWarning: Non-standard ISO country codes: JG, XK
| NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | ||
|---|---|---|---|---|---|---|---|---|
| country | year | |||||||
| Aruba | 2020 | 29563.756955 | 23026.332866 | NaN | 0.428017 | 106766.0 | NaN | 1.901 |
| 2019 | 38221.117314 | 29769.293907 | NaN | 0.437415 | 106310.0 | NaN | 1.901 | |
| 2018 | 39206.356147 | 30536.667193 | NaN | 0.459266 | 105846.0 | NaN | 1.896 | |
| 2017 | 38893.960556 | 30293.351539 | NaN | 0.471874 | 105361.0 | NaN | 1.886 | |
| 2016 | 37046.877414 | 28854.713299 | NaN | 0.502860 | 104865.0 | NaN | 1.872 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Zimbabwe | 1964 | NaN | 1152.997692 | NaN | 3.390942 | 4322854.0 | NaN | 7.347 |
| 1963 | NaN | 1206.107233 | NaN | 3.395753 | 4178726.0 | NaN | 7.311 | |
| 1962 | NaN | 1174.431444 | NaN | 3.378137 | 4039209.0 | NaN | 7.267 | |
| 1961 | NaN | 1197.603795 | NaN | 3.342246 | 3905038.0 | NaN | 7.215 | |
| 1960 | NaN | 1164.740250 | NaN | NaN | 3776679.0 | NaN | 7.158 |
13237 rows × 7 columns
wdi = wdi.reset_index()
df = df.merge(wdi, left_on=['country', 'year'], right_on=['country', 'year'])
df
| iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | IP.PAT.RESD | IP.PAT.NRES | total_patents | NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 | ... | 2.0 | 108.0 | 110.0 | 6712.021615 | 2612.347027 | 17590.916651 | 3.242914 | 31825299.0 | NaN | 5.442 |
| 1 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 | ... | 6.0 | 114.0 | 120.0 | 6982.129420 | 2717.474121 | 18353.872450 | 3.276145 | 30809787.0 | NaN | 5.519 |
| 2 | AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 | ... | 4.0 | 2.0 | 6.0 | 5126.464246 | 1995.241434 | 13224.961921 | 3.280272 | 12657361.0 | NaN | 7.138 |
| 3 | ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IBRD | Tirane | 19.8172 | 41.33170 | ... | 4.0 | 1.0 | 5.0 | 13653.201570 | 4543.386520 | 30958.219895 | -0.426007 | 2854191.0 | NaN | 1.597 |
| 4 | ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IBRD | Tirane | 19.8172 | 41.33170 | ... | 15.0 | 3.0 | 18.0 | 13317.092313 | 4431.539181 | 31103.772527 | -0.246732 | 2866376.0 | 1.6 | 1.617 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3668 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | ... | 34.0 | 191.0 | 225.0 | NaN | 1479.935680 | NaN | 3.657575 | 8562259.0 | NaN | 6.034 |
| 3669 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | ... | 40.0 | 237.0 | 277.0 | NaN | 1564.916121 | NaN | 3.658056 | 8254746.0 | NaN | 6.240 |
| 3670 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | ... | 41.0 | 230.0 | 271.0 | NaN | 1597.890117 | NaN | 3.616362 | 7958239.0 | NaN | 6.451 |
| 3671 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | ... | 35.0 | 274.0 | 309.0 | NaN | 1614.210098 | NaN | 3.539858 | 7675582.0 | NaN | 6.663 |
| 3672 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 | ... | 39.0 | 281.0 | 320.0 | NaN | 1486.218998 | NaN | 3.413262 | 7408630.0 | NaN | 6.865 |
3673 rows × 23 columns
my_xy_plot function plot the relation between GDP per capita and total patents in the years 1990, 1995, 2000, 2010, 2020.
def my_xy_plot(dfin,
x='SP.POP.GROW',
y='ln_gdp_pc',
labelvar='iso3c',
dx=0.006125,
dy=0.006125,
xlogscale=False,
ylogscale=False,
xlabel='Growth Rate of Population',
ylabel='Log[Income per capita in ' + str(year) + ']',
labels=False,
xpct = False,
ypct = False,
OLS=False,
OLSlinelabel='OLS',
ssline=False,
sslinelabel='45 Degree Line',
filename='income-pop-growth.pdf',
hue='region',
hue_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
style='incomeLevel',
style_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=None,
size=None,
sizes=None,
legend_fontsize=10,
label_font_size=12,
save=True):
'''
Plot the association between x and var in dataframe using labelvar for labels.
'''
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
df = dfin.copy()
df = df.dropna(subset=[x, y]).reset_index(drop=True)
# Plot
k = 0
fig, ax = plt.subplots()
sns.scatterplot(x=x, y=y, data=df, ax=ax,
hue=hue,
hue_order=hue_order,
#hue='incomeLevel',
#hue_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
#hue_order=['East Asia & Pacific', 'Europe & Central Asia',
# 'Latin America & Caribbean ', 'Middle East & North Africa',
# 'North America', 'South Asia', 'Sub-Saharan Africa '],
alpha=1,
style=style,
style_order=style_order,
palette=palette,
size=size,
sizes=sizes,
#palette=sns.color_palette("Blues_r", df[hue].unique().shape[0]+6)[:df[hue].unique().shape[0]*2:2],
)
if OLS:
sns.regplot(x=x, y=y, data=df, ax=ax, label=OLSlinelabel, scatter=False)
if ssline:
ax.plot([df[x].min()*.99, df[x].max()*1.01], [df[x].min()*.99, df[x].max()*1.01], c='r', label=sslinelabel)
if labels:
movex = df[x].mean() * dx
movey = df[y].mean() * dy
for line in range(0,df.shape[0]):
ax.text(df[x][line]+movex, df[y][line]+movey, df[labelvar][line], horizontalalignment='left', fontsize=label_font_size, color='black')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
if xpct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
xticks = mtick.FormatStrFormatter(fmt)
ax.xaxis.set_major_formatter(xticks)
if ypct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
ax.yaxis.set_major_formatter(yticks)
if ylogscale:
ax.set(yscale="log")
if xlogscale:
ax.set(xscale="log")
handles, labels = ax.get_legend_handles_labels()
handles = np.array(handles)
labels = np.array(labels)
handles = list(handles[(labels!=hue) & (labels!=style) & (labels!=size)])
labels = list(labels[(labels!=hue) & (labels!=style) & (labels!=size)])
ax.legend(handles=handles, labels=labels, fontsize=legend_fontsize)
if save:
plt.savefig(pathgraphs + filename, dpi=300, bbox_inches='tight')
return fig
df['year'] = df.year.astype(int)
df['gdp_pc'] = df['NY.GDP.PCAP.PP.KD']
df['ln_gdp_pc'] = df['NY.GDP.PCAP.PP.KD'].apply(np.log)
df['ln_pop'] = df['SP.POP.TOTL'].apply(np.log)
df['name'] = df.name.str.strip()
df['incomeLevel'] = df['incomeLevel'].str.title()
df.loc[df.iso3c=='VEN', 'incomeLevel'] = 'Upper Middle Income'
years = [1990, 1995, 2000, 2010, 2020]
df_gdp_pc_patents = df[df['year'].isin(years)].dropna(subset=['ln_gdp_pc','ln_pop'])
df_gdp_pc_patents = df_gdp_pc_patents.sort_values(by='region').reset_index()
df_gdp_pc_patents['total_patents'] = df_gdp_pc_patents['total_patents'].apply(np.log)
df_gdp_pc_patents
| level_0 | iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | ... | NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | gdp_pc | ln_gdp_pc | ln_pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1320 | HKG | HK | Hong Kong SAR, China | East Asia & Pacific | High Income | Not classified | 114.1090 | ... | 55917.647464 | 41469.975506 | 114163.585019 | -0.358933 | 7.481000e+06 | NaN | 0.868 | 55917.647464 | 10.931635 | 15.827877 | ||
| 1 | 1481 | IDN | ID | Indonesia | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Jakarta | 106.8300 | ... | 5689.260223 | 1867.548796 | 12898.669096 | 1.379908 | 2.115138e+08 | NaN | 2.512 | 5689.260223 | 8.646336 | 19.169801 |
| 2 | 1486 | IDN | ID | Indonesia | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Jakarta | 106.8300 | ... | 5892.071754 | 1934.123433 | 13931.907937 | 1.543736 | 1.969343e+08 | NaN | 2.688 | 5892.071754 | 8.681363 | 19.098381 |
| 3 | 603 | CHN | CN | China | East Asia & Pacific | East Asia & Pacific (excluding high income) | Upper Middle Income | IBRD | Beijing | 116.2860 | ... | 3451.679231 | 2193.892991 | 6175.400336 | 0.787957 | 1.262645e+09 | NaN | 1.596 | 3451.679231 | 8.146616 | 20.956475 |
| 4 | 3022 | SGP | SG | Singapore | East Asia & Pacific | High Income | Not classified | Singapore | 103.8500 | ... | 55904.233600 | 34890.715594 | 108220.252596 | 1.732042 | 4.027887e+06 | NaN | 1.600 | 55904.233600 | 10.931395 | 15.208752 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 1078 | ETH | ET | Ethiopia | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low Income | IDA | Addis Ababa | 38.7468 | ... | 727.766685 | 262.025313 | 1755.764632 | 2.882688 | 6.622481e+07 | 4.7 | 6.543 | 727.766685 | 6.589981 | 18.008566 |
| 435 | 1064 | ETH | ET | Ethiopia | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low Income | IDA | Addis Ababa | 38.7468 | ... | 2296.890440 | 826.973053 | 5096.388409 | 2.541386 | 1.149636e+08 | NaN | 4.049 | 2296.890440 | 7.739312 | 18.560126 |
| 436 | 3352 | UGA | UG | Uganda | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low Income | IDA | Kampala | 32.5729 | ... | 2175.031098 | 891.295904 | 6202.214476 | 3.269713 | 4.574100e+07 | NaN | 4.703 | 2175.031098 | 7.684798 | 17.638506 |
| 437 | 620 | COD | CD | Congo, Dem. Rep. | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low Income | IDA | Kinshasa | 15.3222 | ... | 1082.445242 | 505.348339 | 3252.262449 | 3.142652 | 8.956140e+07 | NaN | 5.718 | 1082.445242 | 6.986978 | 18.310435 |
| 438 | 3662 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower Middle Income | Blend | Harare | 31.0672 | ... | 2652.129209 | 1623.930176 | NaN | 2.706407 | 1.043241e+07 | NaN | 4.862 | 2652.129209 | 7.883118 | 16.160428 |
439 rows × 27 columns
g = my_xy_plot(df_gdp_pc_patents,
x='ln_gdp_pc',
y='total_patents',
xlabel='Log[GDP per capita]',
ylabel='Total Patents',
OLS=True,
labels=True,
# ylogscale = True,
#size="ln_pop",
#sizes=(10, 400),
filename='ln-gdp-pc-total-patents.pdf')
df.ln_gdp_pc.mean()
9.610718757334599
my_xy_line_plot function plot the evolution of GDP per capita and total patents by income groups and regions (separate figures).
dfin = df.dropna(subset=['ln_gdp_pc','ln_pop'])
dfin = dfin.sort_values(by='region').reset_index()
dfin['total_patents'] = dfin['total_patents'].apply(np.log)
dfin
| level_0 | iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | ... | NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | gdp_pc | ln_gdp_pc | ln_pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1903 | KOR | KR | Korea, Rep. | East Asia & Pacific | High Income | Not classified | Seoul | 126.9570 | ... | 32363.968659 | 23948.472244 | 65571.756947 | 0.514683 | 49307835.0 | NaN | 1.149 | 32363.968659 | 10.384801 | 17.713594 | |
| 1 | 2717 | PHL | PH | Philippines | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Manila | 121.0350 | ... | 7300.136210 | 3001.043182 | 18110.362690 | 1.579363 | 102113206.0 | NaN | 2.805 | 7300.136210 | 8.895648 | 18.441593 |
| 2 | 2718 | PHL | PH | Philippines | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Manila | 121.0350 | ... | 6973.638909 | 2866.822056 | 17276.530044 | 1.646682 | 100513137.0 | NaN | 2.894 | 6973.638909 | 8.849892 | 18.425799 |
| 3 | 2719 | PHL | PH | Philippines | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Manila | 121.0350 | ... | 6666.250512 | 2740.456489 | 16772.194364 | 1.692088 | 98871558.0 | 2.2 | 2.979 | 6666.250512 | 8.804813 | 18.409332 |
| 4 | 2720 | PHL | PH | Philippines | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IBRD | Manila | 121.0350 | ... | 6351.264942 | 2610.967768 | 15982.714520 | 1.704126 | 97212639.0 | NaN | 3.055 | 6351.264942 | 8.756409 | 18.392411 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2842 | 2390 | MUS | MU | Mauritius | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Upper Middle Income | IBRD | Port Louis | 57.4977 | ... | 9998.617114 | 4653.437938 | 25785.129406 | 1.022765 | 1133996.0 | NaN | 2.120 | 9998.617114 | 9.210202 | 13.941258 |
| 2843 | 2389 | MUS | MU | Mauritius | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Upper Middle Income | IBRD | Port Louis | 57.4977 | ... | 10435.798365 | 4856.905657 | 26837.667976 | 1.252098 | 1148284.0 | NaN | 2.040 | 10435.798365 | 9.252997 | 13.953779 |
| 2844 | 2388 | MUS | MU | Mauritius | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Upper Middle Income | IBRD | Port Louis | 57.4977 | ... | 10953.676885 | 5097.930543 | 28060.383468 | 1.051422 | 1160421.0 | NaN | 1.970 | 10953.676885 | 9.301430 | 13.964293 |
| 2845 | 2400 | MWI | MW | Malawi | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low Income | IDA | Lilongwe | 33.7703 | ... | 1127.195856 | 294.343918 | 2866.782081 | 2.676386 | 11148751.0 | 5.2 | 6.098 | 1127.195856 | 7.027488 | 16.226838 |
| 2846 | 3662 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower Middle Income | Blend | Harare | 31.0672 | ... | 2652.129209 | 1623.930176 | NaN | 2.706407 | 10432409.0 | NaN | 4.862 | 2652.129209 | 7.883118 | 16.160428 |
2847 rows × 27 columns
def my_xy_line_plot(dfin,
x='year',
y='ln_gdp_pc',
labelvar='iso3c',
dx=0.006125,
dy=0.006125,
xlogscale=False,
ylogscale=False,
xlabel='Growth Rate of Population',
ylabel='Log[Income per capita in ' + str(year) + ']',
labels=False,
xpct = False,
ypct = False,
OLS=False,
OLSlinelabel='OLS',
ssline=False,
sslinelabel='45 Degree Line',
filename='income-pop-growth.pdf',
hue='region',
hue_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
style='incomeLevel',
style_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=None,
legend_fontsize=10,
label_fontsize=12,
loc=None,
save=True):
'''
Plot the association between x and var in dataframe using labelvar for labels.
'''
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set_context("talk")
df = dfin.copy()
df = df.dropna(subset=[x, y]).reset_index(drop=True)
# Plot
k = 0
fig, ax = plt.subplots()
sns.lineplot(x=x, y=y, data=df, ax=ax,
hue=hue,
hue_order=hue_order,
alpha=1,
style=style,
style_order=style_order,
palette=palette,
)
if OLS:
sns.regplot(x=x, y=y, data=df, ax=ax, label=OLSlinelabel, scatter=False)
if ssline:
ax.plot([df[x].min()*.99, df[x].max()*1.01], [df[x].min()*.99, df[x].max()*1.01], c='r', label=sslinelabel)
if labels:
movex = df[x].mean() * dx
movey = df[y].mean() * dy
for line in range(0,df.shape[0]):
ax.text(df[x][line]+movex, df[y][line]+movey, df[labelvar][line], horizontalalignment='left', fontsize=label_fontsize, color='black')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
if xpct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
xticks = mtick.FormatStrFormatter(fmt)
ax.xaxis.set_major_formatter(xticks)
if ypct:
fmt = '%.0f%%' # Format you want the ticks, e.g. '40%'
yticks = mtick.FormatStrFormatter(fmt)
ax.yaxis.set_major_formatter(yticks)
if ylogscale:
ax.set(yscale="log")
if xlogscale:
ax.set(xscale="log")
handles, labels = ax.get_legend_handles_labels()
handles = np.array(handles)
labels = np.array(labels)
handles = list(handles[(labels!='region') & (labels!='incomeLevel')])
labels = list(labels[(labels!='region') & (labels!='incomeLevel')])
ax.legend(handles=handles, labels=labels, fontsize=legend_fontsize, loc=loc)
if save:
plt.savefig(pathgraphs + filename, dpi=300, bbox_inches='tight')
return fig
palette=sns.color_palette("Blues_r", df['incomeLevel'].unique().shape[0]+6)[:df['incomeLevel'].unique().shape[0]*2:2]
fig = my_xy_line_plot(dfin,
x='total_patents',
y='ln_gdp_pc',
xlabel='Total Patents',
ylabel='Log[GDP per capita]',
filename='ln-gdp-pc-ls-total-patents.pdf',
hue='incomeLevel',
hue_order=['High Income', 'Upper Middle Income', 'Lower Middle Income', 'Low Income'],
palette=palette,
OLS=False,
labels=False,
legend_fontsize=16,
loc='lower right',
save=True,
ylogscale = True,)
fig = my_xy_line_plot(dfin,
x='total_patents',
y='gdp_pc',
xlabel='Total Patents',
ylabel='GDP per capita',
ylogscale=True,
filename='ln-gdp-pc-regions-TS.pdf',
style='region',
style_order=['East Asia & Pacific', 'Europe & Central Asia',
'Latin America & Caribbean ', 'Middle East & North Africa',
'North America', 'South Asia', 'Sub-Saharan Africa '],
#palette=palette,
OLS=False,
labels=False,
legend_fontsize=12,
loc='lower right',
save=True)
df_2015 = df[df['year'] == 2015]
df_2015
| iso3c | iso2c | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | ... | NY.GDP.PCAP.PP.KD | NY.GDP.PCAP.KD | SL.GDP.PCAP.EM.KD | SP.POP.GROW | SP.POP.TOTL | SP.DYN.WFRT | SP.DYN.TFRT.IN | gdp_pc | ln_gdp_pc | ln_pop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper Middle Income | IBRD | Tirane | 19.8172 | 41.3317 | ... | 11878.454448 | 3952.802538 | 31777.707846 | -0.291206 | 2880703.0 | NaN | 1.677 | 11878.454448 | 9.382481 | 14.873545 |
| 18 | ARE | AE | United Arab Emirates | Middle East & North Africa | High Income | Not classified | Abu Dhabi | 54.3705 | 24.4764 | ... | 65267.415127 | 38663.388256 | 96389.979731 | 0.527292 | 9262896.0 | NaN | 1.541 | 65267.415127 | 11.086248 | 16.041527 | |
| 29 | ARG | AR | Argentina | Latin America & Caribbean | Latin America & Caribbean (excluding high income) | Upper Middle Income | IBRD | Buenos Aires | -58.4173 | -34.6118 | ... | 23933.886612 | 13789.060425 | 58423.306693 | 1.078001 | 43131966.0 | NaN | 2.301 | 23933.886612 | 10.083051 | 17.579775 |
| 65 | ARM | AM | Armenia | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper Middle Income | IBRD | Yerevan | 44.5090 | 40.1596 | ... | 11321.332540 | 3607.289299 | 30479.226034 | 0.450706 | 2925559.0 | NaN | 1.738 | 11321.332540 | 9.334444 | 14.888996 |
| 93 | AUS | AU | Australia | East Asia & Pacific | High Income | Not classified | Canberra | 149.1290 | -35.2820 | ... | 47569.294602 | 56707.022077 | 96035.429492 | 1.439217 | 23815995.0 | NaN | 1.814 | 47569.294602 | 10.769943 | 16.985868 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3559 | WSM | WS | Samoa | East Asia & Pacific | East Asia & Pacific (excluding high income) | Lower Middle Income | IDA | Apia | -171.7520 | -13.8314 | ... | 5993.452243 | 4073.729083 | 24570.322725 | 0.668864 | 193510.0 | NaN | 4.029 | 5993.452243 | 8.698423 | 12.173084 |
| 3565 | YEM | YE | Yemen, Rep. | Middle East & North Africa | Middle East & North Africa (excluding high inc... | Low Income | IDA | Sana'a | 44.2075 | 15.3520 | ... | NaN | 1601.807163 | NaN | 2.578030 | 26497881.0 | NaN | 4.103 | NaN | NaN | 17.092575 |
| 3586 | ZAF | ZA | South Africa | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Upper Middle Income | IBRD | Pretoria | 28.1871 | -25.7460 | ... | 14010.104418 | 6259.839681 | 48490.701443 | 1.532243 | 55386369.0 | NaN | 2.484 | 14010.104418 | 9.547534 | 17.829844 |
| 3627 | ZMB | ZM | Zambia | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Low Income | IDA | Lusaka | 28.2937 | -15.3982 | ... | 3443.553254 | 1338.290927 | 9566.033479 | 3.066671 | 15879370.0 | NaN | 4.918 | 3443.553254 | 8.144259 | 16.580531 |
| 3656 | ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower Middle Income | Blend | Harare | 31.0672 | -17.8312 | ... | 2360.022385 | 1445.069702 | 5103.199920 | 1.663694 | 13814642.0 | 3.6 | 3.896 | 2360.022385 | 7.766426 | 16.441240 |
111 rows × 26 columns
g = my_xy_plot(df_2015,
x='IP.PAT.NRES',
y='IP.PAT.RESD',
xlabel='Residential Patents',
ylabel='Non-Residential Patents',
OLS=True,
labels=True,
# ylogscale = True,
# xlogscale = True,
#size="ln_pop",
#sizes=(10, 400),
filename='res-vs-non-res-patents.pdf')
df.columns
Index(['iso3c', 'iso2c', 'name', 'region', 'adminregion', 'incomeLevel',
'lendingType', 'capitalCity', 'longitude', 'latitude', 'index',
'country', 'year', 'IP.PAT.RESD', 'IP.PAT.NRES', 'total_patents',
'NY.GDP.PCAP.PP.KD', 'NY.GDP.PCAP.KD', 'SL.GDP.PCAP.EM.KD',
'SP.POP.GROW', 'SP.POP.TOTL', 'SP.DYN.WFRT', 'SP.DYN.TFRT.IN', 'gdp_pc',
'ln_gdp_pc', 'ln_pop'],
dtype='object')
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
countries = gp.read_file(io.BytesIO(r.content))
# fig, ax = plt.subplots(figsize=(15,10))
# countries.plot(ax=ax)
# ax.set_title("WGS84 (lat/lon)", fontdict={'fontsize':34})
df_2015 = countries.merge(df_2015, left_on='ADM0_A3', right_on='iso3c')
fig, ax = plt.subplots(figsize=(15,10))
df_2015.plot(column='total_patents', ax=ax, cmap='Reds')
ax.set_title("2015 Patents", fontdict={'fontsize':34})
Text(0.5, 1.0, '2015 Patents')
scheme = mc.Quantiles(df_2015['total_patents'], k=5)
classifier = mc.Quantiles.make(k=5, rolling=True)
df_2015['total_patents_q'] = classifier(df_2015['total_patents'])
df_2015['total_patents_qc'] = df_2015['total_patents_q'].apply(lambda x: scheme.get_legend_classes()[x].replace('[ ', '[').replace('( ', '('))
fig = px.choropleth(df_2015.sort_values('total_patents', ascending=True),
locations="iso3c",
color="total_patents_qc",
hover_name='name',
hover_data=['iso3c', 'ln_pop'],
labels={
"total_patents": "Total Patents (" + str(year) + ")",
},
color_discrete_sequence=px.colors.sequential.Reds,
height=600,
width=1000,
)
# Change legend position
fig.update_layout(legend=dict(
yanchor="bottom",
y=0.15,
xanchor="left",
x=0.05
))
fig.show()
fig = go.Figure(data=go.Choropleth(
locations = df_2015['iso3c'],
z = df_2015['total_patents'],
text = df_2015['name'],
colorscale = 'Blues',
autocolorscale=False,
reversescale=True,
marker_line_color='darkgray',
marker_line_width=0.5,
colorbar_tickprefix = '',
colorbar_title = 'Total Patents',
)
)
fig.update_layout(
autosize=False,
width=800,
height=400,
margin=dict(
l=5,
r=5,
b=10,
t=10,
pad=1
),
paper_bgcolor="LightSteelBlue",
)
fig.show()
filtered_df = df[['year', 'name', 'ln_gdp_pc', 'total_patents', 'IP.PAT.RESD','IP.PAT.NRES', 'region', 'incomeLevel', 'iso3c']]
filtered_df
| year | name | ln_gdp_pc | total_patents | IP.PAT.RESD | IP.PAT.NRES | region | incomeLevel | iso3c | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019 | Angola | 8.811655 | 110.0 | 2.0 | 108.0 | Sub-Saharan Africa | Lower Middle Income | AGO |
| 1 | 2018 | Angola | 8.851109 | 120.0 | 6.0 | 114.0 | Sub-Saharan Africa | Lower Middle Income | AGO |
| 2 | 1992 | Angola | 8.542171 | 6.0 | 4.0 | 2.0 | Sub-Saharan Africa | Lower Middle Income | AGO |
| 3 | 2019 | Albania | 9.521729 | 5.0 | 4.0 | 1.0 | Europe & Central Asia | Upper Middle Income | ALB |
| 4 | 2018 | Albania | 9.496804 | 18.0 | 15.0 | 3.0 | Europe & Central Asia | Upper Middle Income | ALB |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3668 | 1984 | Zimbabwe | NaN | 225.0 | 34.0 | 191.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3669 | 1983 | Zimbabwe | NaN | 277.0 | 40.0 | 237.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3670 | 1982 | Zimbabwe | NaN | 271.0 | 41.0 | 230.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3671 | 1981 | Zimbabwe | NaN | 309.0 | 35.0 | 274.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3672 | 1980 | Zimbabwe | NaN | 320.0 | 39.0 | 281.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
3673 rows × 9 columns
df.columns
Index(['iso3c', 'iso2c', 'name', 'region', 'adminregion', 'incomeLevel',
'lendingType', 'capitalCity', 'longitude', 'latitude', 'index',
'country', 'year', 'IP.PAT.RESD', 'IP.PAT.NRES', 'total_patents',
'NY.GDP.PCAP.PP.KD', 'NY.GDP.PCAP.KD', 'SL.GDP.PCAP.EM.KD',
'SP.POP.GROW', 'SP.POP.TOTL', 'SP.DYN.WFRT', 'SP.DYN.TFRT.IN', 'gdp_pc',
'ln_gdp_pc', 'ln_pop'],
dtype='object')
# Resident, Non-Res, Year, Country, GDP_PC
# g = my_xy_plot(filtered_df,
# x='ln_gdp_pc',
# y='total_patents',
# xlabel='Total GDP per Country',
# ylabel='Total Patents',
# OLS=True,
# # labels=True,
# filename='gdp_vs_patents_relations.pdf')
# Resident vs GDP_PC (Bar plot)
print(filtered_df.ln_gdp_pc.min())
print(filtered_df.ln_gdp_pc.max())
6.461862654729543 11.995175454631555
# bins = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
# out = pd.cut(filtered_df.ln_gdp_pc.values, bins = bins, include_lowest = True)
# ax = out.value_counts().plot.bar(rot = 0, color ='b', figsize = (10,5))
# plt.show()
filtered_df
| year | name | ln_gdp_pc | total_patents | IP.PAT.RESD | IP.PAT.NRES | region | incomeLevel | iso3c | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019 | Angola | 8.811655 | 110.0 | 2.0 | 108.0 | Sub-Saharan Africa | Lower Middle Income | AGO |
| 1 | 2018 | Angola | 8.851109 | 120.0 | 6.0 | 114.0 | Sub-Saharan Africa | Lower Middle Income | AGO |
| 2 | 1992 | Angola | 8.542171 | 6.0 | 4.0 | 2.0 | Sub-Saharan Africa | Lower Middle Income | AGO |
| 3 | 2019 | Albania | 9.521729 | 5.0 | 4.0 | 1.0 | Europe & Central Asia | Upper Middle Income | ALB |
| 4 | 2018 | Albania | 9.496804 | 18.0 | 15.0 | 3.0 | Europe & Central Asia | Upper Middle Income | ALB |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3668 | 1984 | Zimbabwe | NaN | 225.0 | 34.0 | 191.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3669 | 1983 | Zimbabwe | NaN | 277.0 | 40.0 | 237.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3670 | 1982 | Zimbabwe | NaN | 271.0 | 41.0 | 230.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3671 | 1981 | Zimbabwe | NaN | 309.0 | 35.0 | 274.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
| 3672 | 1980 | Zimbabwe | NaN | 320.0 | 39.0 | 281.0 | Sub-Saharan Africa | Lower Middle Income | ZWE |
3673 rows × 9 columns
plt.plot(filtered_df['ln_gdp_pc'], filtered_df['total_patents'])
plt.xlabel("Log [GDP Per Capita]")
plt.ylabel("Total Patents")
plt.title("Total Patents vs Log [GDP Per Capita]")
plt.show()
plt.plot(filtered_df['ln_gdp_pc'], filtered_df['IP.PAT.NRES'])
plt.xlabel("Log [GDP Per Capita]")
plt.ylabel("Non-Residential Patents")
plt.title("Non-Residential Patents vs Log [GDP Per Capita]")
plt.show()
plt.plot(filtered_df['ln_gdp_pc'], filtered_df['IP.PAT.RESD'])
plt.xlabel("Log [GDP Per Capita]")
plt.ylabel("Residential Patents")
plt.title("Residential Patents vs Log [GDP Per Capita]")
plt.show()
Notebook written by Ömer Özak for his students in Economics at Southern Methodist University. Feel free to use, distribute, or contribute.